import pandas as pd
# Load the salary dataset from the local CSV export and preview the first rows.
csv_path = "/Users/soumyaprasadchandra/Downloads/Salary_Dataset_with_Extra_Features.csv"
df = pd.read_csv(csv_path)
df.head()
| Rating | Company Name | Job Title | Salary | Salaries Reported | Location | Employment Status | Job Roles | |
|---|---|---|---|---|---|---|---|---|
| 0 | 3.8 | Sasken | Android Developer | 400000 | 3 | Bangalore | Full Time | Android |
| 1 | 4.5 | Advanced Millennium Technologies | Android Developer | 400000 | 3 | Bangalore | Full Time | Android |
| 2 | 4.0 | Unacademy | Android Developer | 1000000 | 3 | Bangalore | Full Time | Android |
| 3 | 3.8 | SnapBizz Cloudtech | Android Developer | 300000 | 3 | Bangalore | Full Time | Android |
| 4 | 4.4 | Appoids Tech Solutions | Android Developer | 600000 | 3 | Bangalore | Full Time | Android |
# Preview the last rows of the dataset.
df.tail()
| Rating | Company Name | Job Title | Salary | Salaries Reported | Location | Employment Status | Job Roles | |
|---|---|---|---|---|---|---|---|---|
| 22765 | 4.7 | Expert Solutions | Web Developer | 200000 | 1 | Bangalore | Full Time | Web |
| 22766 | 4.0 | Nextgen Innovation Labs | Web Developer | 300000 | 1 | Bangalore | Full Time | Web |
| 22767 | 4.1 | Fresher | Full Stack Web Developer | 192000 | 13 | Bangalore | Full Time | Web |
| 22768 | 4.1 | Accenture | Full Stack Web Developer | 300000 | 7 | Bangalore | Full Time | Web |
| 22769 | 3.8 | Thomson Reuters | Associate Web Developer | 300000 | 7 | Bangalore | Full Time | Web |
# Dimensions of the DataFrame as (rows, columns).
df.shape
(22770, 8)
# List the column labels.
df.columns
Index(['Rating', 'Company Name', 'Job Title', 'Salary', 'Salaries Reported',
'Location', 'Employment Status', 'Job Roles'],
dtype='object')
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()
0
# Print each column's dtype and non-null count, plus overall memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 22770 entries, 0 to 22769 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Rating 22770 non-null float64 1 Company Name 22770 non-null object 2 Job Title 22770 non-null object 3 Salary 22770 non-null int64 4 Salaries Reported 22770 non-null int64 5 Location 22770 non-null object 6 Employment Status 22770 non-null object 7 Job Roles 22770 non-null object dtypes: float64(1), int64(2), object(5) memory usage: 1.4+ MB
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()
| Rating | Salary | Salaries Reported | |
|---|---|---|---|
| count | 22770.000000 | 2.277000e+04 | 22770.000000 |
| mean | 3.918213 | 6.953872e+05 | 1.855775 |
| std | 0.519675 | 8.843990e+05 | 6.823668 |
| min | 1.000000 | 2.112000e+03 | 1.000000 |
| 25% | 3.700000 | 3.000000e+05 | 1.000000 |
| 50% | 3.900000 | 5.000000e+05 | 1.000000 |
| 75% | 4.200000 | 9.000000e+05 | 1.000000 |
| max | 5.000000 | 9.000000e+07 | 361.000000 |
# Number of distinct values in each column.
df.nunique()
Rating 41 Company Name 11261 Job Title 1080 Salary 316 Salaries Reported 82 Location 10 Employment Status 4 Job Roles 11 dtype: int64
# Split the column names by dtype so later cells can loop over each group.
objects_cols = df.select_dtypes(include=['object']).columns.tolist()
num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

print("object type columns", objects_cols)
print("Numerical type columns:", num_cols)
object type columns ['Company Name', 'Job Title', 'Location', 'Employment Status', 'Job Roles'] Numerical type columns: ['Rating', 'Salary', 'Salaries Reported']
# Distinct company names, in order of first appearance.
df['Company Name'].unique()
array(['Sasken', 'Advanced Millennium Technologies', 'Unacademy', ...,
'Unicon Systems', 'Expert Solutions', 'Nextgen Innovation Labs'],
dtype=object)
# Number of records per company, most frequent first.
df['Company Name'].value_counts()
Tata Consultancy Services 271
Amazon 184
Infosys 169
Accenture 150
Cognizant Technology Solutions 144
...
Talent Anywhere 1
WisdmLabs 1
Softdel 1
Dentsu 1
Nextgen Innovation Labs 1
Name: Company Name, Length: 11261, dtype: int64
# Distinct job titles, in order of first appearance.
df['Job Title'].unique()
array(['Android Developer', 'Android Developer - Intern',
'Android Developer - Contractor', ..., 'Web Developer Contractor',
'Full Stack Web Developer', 'Associate Web Developer'],
dtype=object)
# Number of records per job title, most frequent first.
df['Job Title'].value_counts()
Software Development Engineer 2351
Android Developer 2029
Software Development Engineer (SDE) 1614
Front End Developer 1412
Test Engineer 1314
...
Java Andriod Developer 1
Java Deceloper 1
Java/J2EE Programmer 1
Java SOA Developer 1
Associate Web Developer 1
Name: Job Title, Length: 1080, dtype: int64
# Distinct locations present in the dataset.
df['Location'].unique()
array(['Bangalore', 'Chennai', 'Hyderabad', 'New Delhi', 'Pune', 'Jaipur',
'Kerala', 'Kolkata', 'Madhya Pradesh', 'Mumbai'], dtype=object)
# Number of records per location, most frequent first.
df['Location'].value_counts()
Bangalore 8264 Hyderabad 4467 New Delhi 4176 Chennai 2458 Pune 2134 Mumbai 749 Kolkata 178 Madhya Pradesh 155 Kerala 108 Jaipur 81 Name: Location, dtype: int64
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings (e.g. from
# seaborn); consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
import numpy as np
# Bar chart of how many records fall in each location.
# The column is selected by name through `x=`: passing the Series
# positionally while also supplying `data=df` binds `data` twice on
# seaborn >= 0.12 and raises a TypeError.
plt.figure(figsize=(15,6))
sns.countplot(x='Location', data=df, palette='hls')
plt.show()
# Pie chart of each location's share of the records (matplotlib).
location_counts = df['Location'].value_counts()

plt.figure(figsize=(30,20))
plt.pie(
    location_counts,
    labels=location_counts.index,
    autopct='%1.2f%%',
)
plt.title('Location', size=20)
plt.show()
import plotly.express as px
# Interactive (plotly) version of the location share pie chart,
# with the title centred horizontally.
value_counts = df['Location'].value_counts()
fig = px.pie(names=value_counts.index, values=value_counts.values)
fig.update_layout(title='Pie Chart of Location', title_x=0.5)
fig.show()
# Distinct employment status values.
df['Employment Status'].unique()
array(['Full Time', 'Intern', 'Contractor', 'Trainee'], dtype=object)
# Bar chart of how many records fall under each employment status.
# The column is selected by name through `x=`: passing the Series
# positionally while also supplying `data=df` binds `data` twice on
# seaborn >= 0.12 and raises a TypeError.
plt.figure(figsize=(15,6))
sns.countplot(x='Employment Status', data=df, palette='hls')
plt.show()
# Pie chart of each employment status' share of the records (matplotlib).
status_counts = df['Employment Status'].value_counts()

plt.figure(figsize=(15,6))
# Wedges are labelled with the category names (the index), not the raw counts.
# autopct must be a valid %-format string; the previous '1f.2f%%' had no
# conversion specifier, so matplotlib could not format the percentages.
plt.pie(status_counts, labels=status_counts.index, autopct='%1.2f%%')
plt.title('Employment Status', size=30)
plt.show()
# Interactive (plotly) version of the employment status pie chart,
# with the title centred horizontally.
value_counts = df['Employment Status'].value_counts()
fig = px.pie(names=value_counts.index, values=value_counts.values)
fig.update_layout(title='Pie Chart of Employment Status', title_x=0.5)
fig.show()
# Distinct job titles, in order of first appearance.
# NOTE(review): this repeats an earlier cell, while the cells that follow
# analyse 'Job Roles' — confirm whether 'Job Roles' was the intended column.
df['Job Title'].unique()
array(['Android Developer', 'Android Developer - Intern',
'Android Developer - Contractor', ..., 'Web Developer Contractor',
'Full Stack Web Developer', 'Associate Web Developer'],
dtype=object)
# Number of records per job role, most frequent first.
df['Job Roles'].value_counts()
SDE 8183 Android 2945 Frontend 2163 Java 1858 Testing 1740 IOS 1631 Backend 1194 Web 999 Python 947 Database 865 Mobile 245 Name: Job Roles, dtype: int64
# Bar chart of how many records fall under each job role.
# The column is selected by name through `x=`: passing the Series
# positionally while also supplying `data=df` binds `data` twice on
# seaborn >= 0.12 and raises a TypeError.
plt.figure(figsize=(15,6))
sns.countplot(x='Job Roles', data=df, palette='hls')
plt.show()
# Pie chart of each job role's share of the records (matplotlib).
role_counts = df['Job Roles'].value_counts()

plt.figure(figsize=(30,20))
# '%1.2f%%' renders e.g. "35.94%"; the previous '1%.2f%%' put a literal
# "1" in front of every percentage (e.g. "135.94%").
plt.pie(role_counts, labels=role_counts.index, autopct='%1.2f%%')
plt.title('Job Roles', size=30)
plt.show()
# Interactive (plotly) version of the job role pie chart,
# with the title centred horizontally.
value_counts = df['Job Roles'].value_counts()
fig = px.pie(names=value_counts.index, values=value_counts.values)
fig.update_layout(title='Pie Chart of Job Roles', title_x=0.5)
fig.show()
# One 20-bin histogram with a KDE overlay per numeric column,
# each drawn on its own figure with vertical x tick labels.
for i in num_cols:
    plt.figure(figsize=(15,6))
    column_values = df[i]
    sns.histplot(column_values, kde=True, bins=20, palette='hls')
    plt.xticks(rotation=90)
    plt.show()
# Density-normalised 20-bin histogram with a KDE overlay per numeric column.
# `sns.distplot` is deprecated, so this uses the `histplot` equivalent:
# stat='density' reproduces distplot's normalisation and cut=3 its KDE extent.
for i in num_cols:
    plt.figure(figsize=(15,6))
    sns.histplot(df[i], kde=True, stat='density', kde_kws={'cut': 3}, bins=20)
    plt.xticks(rotation=90)
    plt.show()
# One horizontal box plot per numeric column, each on its own figure.
# The column is selected by name through `x=`: passing the Series
# positionally while also supplying `data=df` binds `data` twice on
# seaborn >= 0.12 and raises a TypeError.
for k in num_cols:
    plt.figure(figsize=(15,6))
    sns.boxplot(x=k, data=df, palette='hls')
    plt.xticks(rotation=90)
    plt.show()
# Interactive (plotly) box plot for every numeric column, one figure each.
for col in num_cols:
    fig = px.box(data_frame=df, y=col)
    fig.show()
# One horizontal violin plot per numeric column, each on its own figure.
# The column is selected by name through `x=`: passing the Series
# positionally while also supplying `data=df` binds `data` twice on
# seaborn >= 0.12 and raises a TypeError.
for j in num_cols:
    plt.figure(figsize=(15,6))
    sns.violinplot(x=j, data=df, palette='hls')
    plt.xticks(rotation=90)
    plt.show()
# Interactive (plotly) violin plot for every numeric column, one figure each.
for col in num_cols:
    fig = px.violin(data_frame=df, y=col)
    fig.show()
# Line plot for every ordered pair of distinct numeric columns.
# The inner statement must be a `for` loop: the previous `if j in num_cols`
# merely tested a `j` left over from an earlier cell, so only the pairs
# involving that single stale column were ever drawn.
for i in num_cols:
    for j in num_cols:
        if i != j:
            plt.figure(figsize=(15,6))
            sns.lineplot(x=df[i], y=df[j], data=df, ci=None, palette='hls')
            plt.xticks(rotation=90)
            plt.show()
# Scatter plot for every ordered pair of distinct numeric columns.
# `ci=None` was dropped: a scatter plot draws raw points, so there is no
# confidence interval to disable, and recent seaborn releases no longer
# accept the argument. `palette` was dropped as well, since no `hue`
# variable is mapped for it to colour.
for i in num_cols:
    for j in num_cols:
        if i != j:
            plt.figure(figsize=(15,6))
            sns.scatterplot(x=df[i], y=df[j], data=df)
            plt.xticks(rotation=90)
            plt.show()
from wordcloud import WordCloud
# Word cloud of company names rendered with matplotlib.
# All names are joined into one space-separated string for the generator.
data = ' '.join(df['Company Name'].values)

cloud_options = dict(
    width=800,
    height=400,
    margin=0,
    background_color='black',
    colormap='plasma',
)
mycloud = WordCloud(**cloud_options).generate(data)

# Show the cloud as an image with no axes or padding around it.
plt.figure(figsize=(15,6))
plt.imshow(mycloud, interpolation='bilinear')
plt.axis('off')
plt.margins(x=0,y=0)
plt.show()
import plotly.express as px
from wordcloud import WordCloud
# Word cloud of company names rendered as an interactive plotly image.
data = ' '.join(df["Company Name"].values)
wordcloud = WordCloud(width=800, height=400, background_color='black').generate(data)

fig = px.imshow(wordcloud, template='ggplot2')
fig.update_layout(title_text='Word Cloud of Company Names', title_x=0.5)
# Tick labels are pixel coordinates of the image, so hide them on both axes.
fig.update_xaxes(showticklabels=False)
fig.update_yaxes(showticklabels=False)
fig.show()
# Horizontal bar chart of the 50 companies with the most records.
companies = df['Company Name'].value_counts().head(50)

plt.figure(figsize=(12,15))
sns.barplot(x=companies.values, y=companies.index, palette='plasma')
plt.show()
# Number of records per company for the first 20 companies in sorted order.
# `.size()` counts the rows in each group; the previous `.sum()` on this
# string column concatenated each company's name with itself instead.
df.groupby('Company Name')['Company Name'].size().head(20)
Company Name (X,Y,Z) Architecture & Design (X,Y,Z) Architecture & Design (no)name (no)name(no)name(no)name(no)name(no)name(no)name - -- .... .... .Kreate .Kreate.Kreate 1 Crore Projects 1 Crore Projects 10 10 100Plus (India) 100Plus (India) 10Decoders Consultancy Services 10Decoders Consultancy Services10Decoders Cons... 11Signals Technologies 11Signals Technologies 17544 17544 1985 1985 1Digitals 1Digitals 1Gen 1Gen1Gen 1K Kirana Bazaar 1K Kirana Bazaar1K Kirana Bazaar 1Tab 1Tab 1mg 1mg1mg1mg1mg1mg1mg1mg1mg 1st Risk Solutions 1st Risk Solutions 2018 2018 20K Group 20K Group Name: Company Name, dtype: object
12510
22502
22817
15228
12254
12863